This notebook documents the creation of a B-ALL classifier constructed from two datasets (St.Judes and Lund). The classifier will be a multi-class classifier utilising the Random Forest algorithm, as implemented initially in Fortran and later in R. By utilising two different datasets we hope to mitigate batch-specific effects and biases, as well as improve statistical power and classification performance.
We have the log FPKM dataset for 289 patients at St.Judes (~150 with Phlike or Ph+) with which to train and test our classifier. The classifier aims to separate patients into one of four categories: Phlike, ERG, ETV and Other (a miscellaneous category).
We have the raw RNA-seq data for 195 patients from Lund University. The samples are from patients with paediatric B-cell precursor acute lymphoblastic leukaemia (BCP ALL). Of these samples, 127 (65%) had in-frame fusion genes observed. The Lund samples provide a group not present in the St.Judes data, which we call DUX4 (a recurrent IGH-DUX4 or ERG-DUX4 fusion).
Firstly, we need to read in our data, do some munging and sculpt it into a useable form.
library(edgeR)
## Loading required package: limma
library(limma)
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(statmod)
library(RColorBrewer)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
library(biomaRt)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:biomaRt':
##
## select
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(RUVSeq)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:limma':
##
## plotMA
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colnames,
## do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff,
## sort, table, tapply, union, unique, unsplit, which, which.max,
## which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: EDASeq
## Loading required package: ShortRead
## Loading required package: BiocParallel
## Loading required package: Biostrings
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:gplots':
##
## space
## The following objects are masked from 'package:base':
##
## colMeans, colSums, expand.grid, rowMeans, rowSums
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, regroup, slice
## Loading required package: XVector
## Loading required package: Rsamtools
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: GenomicAlignments
## Loading required package: SummarizedExperiment
##
## Attaching package: 'GenomicAlignments'
## The following object is masked from 'package:dplyr':
##
## last
##
## Attaching package: 'ShortRead'
## The following object is masked from 'package:dplyr':
##
## id
#Read the St.Judes log2-FPKM expression matrix (genes x samples, plus trailing annotation columns)
normalisedcounts= read.csv(file="../../MCRI/BALL/ALL_FPKM_filtered_genes_log2signal_annot.txt",sep="\t",stringsAsFactors=FALSE,row.names=1)
#Just take the 289 sample columns, leave out the other annotational columns
ncounts = normalisedcounts[,1:289]
#Normalise counts (per-gene min-max scaling, currently disabled)
#ncounts2 = t(apply(ncounts,1, function(x) (x - min(x))/(max(x) - min(x))))
#normalisedcounts[,1:289] = ncounts2
#Figure out the classes
#Make the groupings from Phlike and non phlike:
#sample names look like "<class>___...___<id>.Aligned..."; keep the part before ".Aligned"
pheno = strsplit2(colnames(ncounts),".Aligned",fixed=TRUE)[,1]
#Collapse each run of 7-11 underscores to a single dot.
#(Replaces the previous chain of five length-specific gsub() calls, which
#handled runs of exactly 11,10,9,8,7 underscores; a bounded greedy
#quantifier matches the same runs in one pass.)
pheno = gsub("_{7,11}",".",pheno)
tmp = strsplit2(pheno,".",fixed=TRUE) #Split the string by . ; first field is the class label
#Chained ifelse mapping the raw label onto the four classifier categories
targets = data.frame(Fusion = ifelse(grepl("Phlike", tmp[,1]) | grepl("PHALL", tmp[,1]), "Phlike",
ifelse(grepl("ERG", tmp[,1]), "ERG",
ifelse(grepl("ETV", tmp[,1]),"ETV","Other"))),
Sex = ifelse(t(ncounts["XIST",] > 3) , "F", "M") #t() transpose into a vector of rows, rather than columns
)
targets$Sample = pheno
#Rename samples with the cleaned-up names
colnames(normalisedcounts)[1:289] = pheno
#For "Other" samples keep the finer-grained label taken from the sample name;
#(the redundant interim assignment targets$Original_Class = targets$Fusion was removed:
#the ifelse below creates/overwrites the column in full)
targets$Original_Class = ifelse(targets$Fusion == "Other",tmp[,1],as.character(targets$Fusion))
#head(targets)
ntargets=targets
Gather the Lund data
#Now read in Lund data
#Read the Lund raw count matrix (genes x samples, with annotation columns such as Length)
lcounts= read.table(file="../../MCRI/BALL/EGAD00001002112/counts.txt",sep="\t",stringsAsFactors=FALSE,row.names=1,header=TRUE)
#Strip the EGA run prefix and alignment suffix from the sample names,
#then drop zero-padding so names match the supplementary table ("Case_1" etc.)
colnames(lcounts) = gsub('.*_EGAR.*_C','C',colnames(lcounts))
colnames(lcounts) = gsub('.Aligned.out.bam','',colnames(lcounts))
colnames(lcounts) = gsub('_0*','_',colnames(lcounts))
#Keep the gene lengths (used below for FPKM calculation)
llength = data.frame(Length = lcounts[,'Length'])
#Make DGE list from the sample columns only (columns 6:200 = the 195 samples)
y.keep <- DGEList(counts=lcounts[,6:200])
#Remove XIST to avoid gender differences (currently disabled)
#remove <- row.names(y.keep) == "XIST"
#y.keep = y.keep[!remove,]
y.keep = calcNormFactors(y.keep) #Normalise the raw counts (TMM normalisation factors)
#Convert to log2 FPKM using the gene lengths, to be comparable with the St.Judes data
FPKM <- rpkm(y.keep,normalized.lib.sizes=TRUE,log=TRUE,prior.count=0.25,gene.length = lcounts$Length)
#FPKM <- rpkm(y.keep,log=TRUE,gene.length = lcounts$Length)
FPKM <- data.frame(FPKM)
FPKM$Gene_Symbol = row.names(FPKM)
#head(FPKM)
#Lund sample annotation from the paper's supplementary table (skip the header row)
lsamp= read.csv(file="../../MCRI/BALL/Supplementary_Lund/ncomms11790-s4.csv",stringsAsFactors=FALSE,row.names=1,skip=1)
#Alternative grouping including HyperDiploidy as a fourth category (disabled)
#ltargets = data.frame(Fusion = ifelse(grepl("BCR-ABL1", lsamp[,2]) | grepl("Ph-like", lsamp[,2]), "Phlike",
# ifelse(grepl("DUX4", lsamp[,2]),"ERG", ifelse(grepl("hyperdiploidy", lsamp[,2]), "HyperDiploidy",
# ifelse(grepl("ETV", lsamp[,2]),"ETV","Other")))))
#Remove hyperdiploidy from categories; DUX4 samples are pooled with ERG
ltargets = data.frame(Fusion = ifelse(grepl("BCR-ABL1", lsamp[,2]) | grepl("Ph-like", lsamp[,2]), "Phlike",
ifelse(grepl("DUX4", lsamp[,2]),"ERG",
ifelse(grepl("ETV", lsamp[,2]),"ETV","Other"))))
ltargets$Sample = paste0("Case_",row.names(lsamp))
ltargets$Original_Class = ifelse(ltargets$Fusion == "Other",lsamp[,2],as.character(ltargets$Fusion))
#Now re-order the targets to match the column order of FPKM/y.keep
m <- match(colnames(y.keep$counts),ltargets$Sample)
m <- m[!is.na(m)] #Drop count columns with no matching annotation row
ltargets <- ltargets[m,] #Reorder
#head(ltargets)
#head(ltargets)
Now we wish to combine both datasets together only including samples from the class we wish to classify.
#Lets use dplyr to merge the data based on the shared gene symbols
#(inner join keeps only genes present in both datasets)
all = inner_join(normalisedcounts,FPKM,by="Gene_Symbol")
row.names(all) <- all$Gene_Symbol
#Keep only the sample columns: 1:289 are St.Judes, 296:490 are Lund
#(columns 290:295 are St.Judes annotation columns carried through the join)
alls = all[,c(1:289,296:490)]
#Combine sample fusion info; columns 1,3,4 of targets are Fusion, Sample, Original_Class
#(column 2, Sex, is dropped to match ltargets' columns)
all_samp = rbind(targets[,c(1,3,4)],ltargets)
In order to visualise the two datasets together and to understand how significant the batch effects are we want to produce PCA plots and colour them variously.
#Define a new colour palette with enough colours for the nine different categories
pal = brewer.pal(9,"Set1")
#Define plotting symbols as a vector (indexed by batch below)
pchvec <- c(1:5)
#Batch indicator: first 289 columns are St.Judes, remaining 195 are Lund
batch <- c(rep(1,289),rep(2,195))
#MDS of the combined data, coloured by fusion class, symbol by batch
plotMDS(alls,top=1000,gene.selection="common",cex=0.8,col=pal[all_samp$Fusion],pch=pchvec[batch])
legend('center',legend=unique(levels(all_samp$Fusion)),fill=pal,cex=0.8)
legend('top',legend=c("St.Judes","Lund"),pch=pchvec)
Colour by batch
pal = brewer.pal(9,"Set1")
#Define marker set as a vector (one symbol per batch)
pchvec <- c(1:2)
batch <- c(rep(1,289),rep(2,195))
#First 2 dimensions, coloured by batch to show the batch effect directly
plotMDS(alls,top=1000,gene.selection="common",cex=0.8,col=pal[batch],pch=pchvec[batch])
Now, we clearly have some batch effects in our data… but can we remove these effects by using RUVSeq?
#Design matrix on fusion class, used to rank genes by association with biology
design <- model.matrix(~all_samp$Fusion)
fit <- lmFit(alls,design)
efit <- eBayes(fit[,-1],trend=TRUE,robust=TRUE) #Remove the intercept [,-1]
#rank(-p) gives rank 1 to the LARGEST p-value, so my_genes holds the 1000 genes
#LEAST associated with fusion class -- presumably intended as negative-control
#genes for RUV, but currently unused (RUVs below uses all genes)
ranked_p = rank(-efit$F.p.value)
my_genes = row.names(efit)[ranked_p<=1000]
#Replicate groups for RUVs: samples of the same fusion class are treated as replicates
differences <- makeGroups(all_samp$Fusion)
#ruv <- RUVg(as.matrix(alls),cIdx=mygenes,k=1,isLog=TRUE)
#RUVs with all genes as controls, k=2 unwanted factors, using the replicate groups
ruv <- RUVs(as.matrix(alls),cIdx=row.names(efit),k=2,differences,isLog=TRUE)
Now lets see how big the batch effects are:
res2= ruv$normalizedCounts#Batch-corrected matrix used for the rest of the analysis
#res2=alls #Keep in batch effects
pal = brewer.pal(9,"Set1")
#Define marker set as a vector
pchvec <- c(1:2)
batch <- c(rep(1,289),rep(2,195))
#First 2 dimensions, colour by batch: after RUVs the two batches should overlap
plotMDS(res2,top=1000,gene.selection="common",cex=0.8,col=pal[batch],pch=pchvec[batch])
legend('bottomleft',legend=c("St.Judes","Lund"),fill=pal,pch=pchvec)
So in the above plot, we can see that we have hopefully removed a lot of the batch effects without removing too much of the biology.
Looking at just the biological variation:
#Define a new colour palette with enough colours for the nine different categories
pal = brewer.pal(9,"Set1")
#Define marker set as a vector
pchvec <- c(1:2)
batch <- c(rep(1,289),rep(2,195))
#Batch-corrected MDS coloured by fusion class, to check the biology survived RUVs
plotMDS(res2,top=1000,gene.selection="common",cex=0.8,col=pal[all_samp$Fusion],pch=pchvec[batch])
legend('bottomright',legend=unique(levels(all_samp$Fusion)),fill=pal,pch=pchvec,cex=0.8)
legend('topleft',legend=c("St.Judes","Lund"),pch=pchvec)
Lets also have a look at the sub-classes of Other to see if there is some clustering there:
#NOTE(review): n and the qualitative-palette colours below are dead code --
#both are superseded by color = ggplotColors(16) further down
n <- 20
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
color = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
# Emulate ggplot2's default discrete colour scale: g hues evenly spaced
# around the HCL colour wheel starting at 15 degrees, with fixed
# chroma 100 and luminance 65. Returns a character vector of g colours.
ggplotColors <- function(g){
  step <- 360/g
  hcl(h = cumsum(c(15, rep(step, g - 1))), c = 100, l = 65)
}
#16 ggplot-style hues, one per Original_Class level
color = ggplotColors(16)
pchvec <- c(1:2)
batch <- c(rep(1,289),rep(2,195))
#Batch-corrected MDS coloured by the finer-grained original class labels
plotMDS(res2,gene.selection="common",cex=0.8,col=color[factor(all_samp$Original_Class)],pch=pchvec[batch])
legend('bottomright',legend=unique(levels(factor(all_samp$Original_Class))),fill=color,pch=16,cex=0.5,ncol=2)
legend('topleft',legend=c('St.Judes','Lund'),pch=unique(pchvec),cex=0.5)
So we can remove the batch effects if we wish too, however in order to have a more robust classifier we can keep them in! Now lets train on all the genes!!
bnorm = res2 #Batch-corrected log-FPKM matrix used for feature selection
#######################################################
#Find the 1000 most variable genes in the entire dataset (disabled)
#######################################################
#Probably a bad strategy since the highest variance is associated with batch......
#vars <-apply(bnorm,1,sd)
#sort
#vars <- sort(vars,decreasing=TRUE)
#Now just select the 5000 most variable genes, based on the names of the genes
#datExpr <- bnorm[names(vars[1:10000]),]
##########################################################
# Find genes which are the most variable in each set
# since the largest variation in each set is biology
# Use the overlap of the two
##########################################################
#Highly variable Lund genes: per-gene SD over the 195 Lund sample columns
var_lund <-apply(lcounts[6:200],1,sd)
#sort
var_lund <- sort(var_lund,decreasing=TRUE)
#Highly variable St.Judes genes: per-gene SD over the 289 St.Judes samples
var_jude <-apply(ncounts,1,sd)
#sort
var_jude <- sort(var_jude,decreasing=TRUE)
#Genes in the top 2000 most variable of BOTH datasets
comb_var = intersect(names(var_lund)[1:2000],names(var_jude)[1:2000])
#Subset the batch-corrected matrix to those shared highly-variable genes
datExpr <- bnorm[comb_var,]
#Only the non "Other" samples (disabled)
#cands <- all_samp[all_samp$Fusion != "Other",]
#Remove the High Hyperdiploidy samples which are possibly confounded (disabled)
#cands <- all_samp[all_samp$Original_Class != "High hyperdiploidy",]
#Include the others too!
cands <- all_samp
datExpr2 <- datExpr[,cands$Sample]
Now divvy up the entire dataset into training data (80% of samples) and test data (20 % of samples) in order to construct and test our random forest classifier.
#Now subset entire dataset into training and test datasets
#sample some random rows
resp <- cands$Fusion
resp <- as.data.frame(resp)
resp$resp <- factor(resp$resp)
#Transpose datExpr to be sample x gene, as randomForest expects
datExpr3 <- t(datExpr2)
nTest = ceiling(nrow(datExpr3) * 0.2) #Number of samples (20%) to hold out for testing
set.seed(1) #Set a seed for reproducibility
ind = sample(nrow(datExpr3),nTest,FALSE) #Sample a random set of indices comprising 20%
df.train = datExpr3[-ind,] #Remaining 80% of samples used for training
df.test = datExpr3[ind,] #Held-out 20%, using the batch-corrected (RUV residual) values
#df.test = d3[ind,] #Take first 20% for testing, using FPKM not batch normalised
classtrain = resp[-ind,]
classtest = resp[ind,]
Use cross validation to decide on what (hyper)parameters to use for mtry (the number of variables to subset on for each node) and ntree (the number of trees to use when constructing the random forest). A 5-fold cross validation is used.
#Now cross-validate RF to do feature selection and find mtry,ntree e.t.c
#5-fold CV of prediction error as the number of variables is reduced stepwise
result <- rfcv(data.frame(df.train), classtrain, cv.fold=5)
with(result, plot(n.var, error.cv, log="x", type="o", lwd=2))
This demonstrates that checking roughly 50 genes gives a decent error. Now we can tune.
#Now tuneRF to search for the mtry minimising OOB error, starting from 50
#and halving/doubling (stepFactor=0.5)
tuneRF(data.frame(df.train),classtrain,mtryStart=50,stepFactor=0.5)
## mtry = 50 OOB error = 8.53%
## Searching left ...
## mtry = 100 OOB error = 6.46%
## 0.2424242 0.05
## mtry = 200 OOB error = 7.75%
## -0.2 0.05
## Searching right ...
## mtry = 25 OOB error = 8.53%
## -0.32 0.05
## mtry OOBError
## 25.OOB 25 0.08527132
## 50.OOB 50 0.08527132
## 100.OOB 100 0.06459948
## 200.OOB 200 0.07751938
Looks like 100 is actually a pretty good value for mtry. So lets build a random forest from the most variable genes in the datasets with the parameters we have cross validated.
#Now train the random forest on all selected genes
set.seed(1) #Set the seed in order to gain reproducibility
#Random forest only accepts a data.frame, not a matrix, hence convert;
#mtry=100 chosen from the tuneRF run above
RF1 = randomForest(classtrain~., data=data.frame(df.train),importance=T,mtry=100,proximity=TRUE)
RF1
##
## Call:
## randomForest(formula = classtrain ~ ., data = data.frame(df.train), importance = T, mtry = 100, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 100
##
## OOB estimate of error rate: 5.68%
## Confusion matrix:
## ERG ETV Other Phlike class.error
## ERG 49 1 1 1 0.05769231
## ETV 0 91 0 0 0.00000000
## Other 0 3 97 6 0.08490566
## Phlike 0 2 8 128 0.07246377
Now we can extract the most important variables used in the random forest classification.
#Extract variable importance (per-class and Gini) from the fitted forest
var.imp = data.frame(importance(RF1))
#Looking into the importance of different variables
varImpPlot(RF1)
#Extract Gini importance (reusing var.imp computed above -- the original
#recomputed importance(RF1) a second time for no reason)
RFImportanceGini=var.imp$MeanDecreaseGini
#Assess how the OOB error changes as the forest is restricted to the
#top-i genes ranked by mean decrease in Gini impurity
set.seed(1)
NumberImportantGenes=c(1:100)
OOBerror=rep(NA, length(NumberImportantGenes) )
for (i in seq_along(NumberImportantGenes)) {
topGenes= rank(-RFImportanceGini, ties.method="first" ) <= i
#Since the variables are highly correlated, we choose a small value for mtry (mtry=1).
RF2 <- randomForest(classtrain~., data=data.frame(df.train[,topGenes]), ntree=201,
importance=FALSE, mtry=1)
#OOB error = 1 - accuracy computed from the OOB predictions
OOBerror[i]=1-sum(diag(table(RF2$predicted,classtrain)))/length(classtrain)
}
plot(NumberImportantGenes,OOBerror ,cex=1.5,xlab="Number of Top Most
Important Genes", ylab="RF Error Rate",cex.lab=1.5 )
Furthermore, we can use the proximity matrix of the random forest to visualise the clustering of the various groups and see how well they separate.
#View MDS of the random-forest proximity matrix, coloured by true class
MDSplot(RF1, classtrain,palette=pal)
legend("bottomleft",legend=unique(levels(classtrain)),fill=pal,cex=0.7)
So as we can see, almost all of the discrimination is captured by a small number of top genes, so in order to have a more general model we can simply build a random forest on just those top genes:
#Get top gene names, ranked by mean decrease in Gini impurity
mDGini = var.imp[,"MeanDecreaseGini",drop=FALSE]
topGeneNames = mDGini[order(-mDGini$MeanDecreaseGini),,drop=FALSE]
#Just take top 20
topGeneNames = topGeneNames[1:20,,drop=FALSE]
topgenes = row.names(topGeneNames)
#data.frame() mangles "-" in gene names to "." (make.names); map them back.
#NOTE(review): this also converts any genuine "." in a gene symbol to "-" --
#confirm no symbols in topgenes legitimately contain dots
topgenes=gsub("\\.","-",topgenes) #Weird bug which switches a dot with a dash when using RF???
#Cross validating the number of features to use (5-fold, halving each step)
rfcv(df.train[,topgenes],classtrain,cv.fold=5,step=0.5)
## $n.var
## [1] 20 10 5 2 1
##
## $error.cv
## 20 10 5 2 1
## 0.08010336 0.08527132 0.24289406 0.38501292 0.58397933
##
## $predicted
## $predicted$`20`
## [1] Other ERG ERG ERG ERG ERG ERG ERG ERG ERG
## [11] ERG ETV ERG ERG ERG ERG ERG ERG ERG ERG
## [21] ERG ERG ERG ERG ERG ERG ERG ERG ERG ERG
## [31] ERG ERG ERG ERG ERG ERG ERG ERG ERG ERG
## [41] ERG ERG Other ERG ERG ERG ERG ETV ETV ETV
## [51] ETV ETV ETV ETV ETV ETV ETV ETV ETV ETV
## [61] ETV ETV ETV ETV ETV ETV ETV ETV ETV ETV
## [71] ETV ETV ETV ETV ETV ETV ETV ETV ETV ETV
## [81] ETV ETV ETV ETV ETV ETV ETV ETV ETV ETV
## [91] ETV Other Other Other Other Other Phlike Phlike ETV Other
## [101] Other Other Other Phlike Phlike Phlike Phlike Phlike Phlike Phlike
## [111] Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike
## [121] Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike
## [131] Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike
## [141] Phlike Phlike Other Phlike Phlike Phlike Phlike Phlike Phlike Phlike
## [151] Phlike Phlike Phlike Phlike Phlike Other Phlike Phlike Phlike Phlike
## [161] Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike
## [171] Phlike Phlike Phlike ETV Phlike Phlike Phlike Phlike Phlike Phlike
## [181] ETV Phlike Phlike Phlike Phlike Phlike Other Phlike Phlike Phlike
## [191] Phlike Phlike Phlike Phlike Other Phlike Phlike Phlike Phlike Phlike
## [201] Phlike Phlike Other Phlike Phlike Phlike Phlike Phlike Phlike Phlike
## [211] Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike
## [221] Phlike Phlike Other Phlike Phlike Phlike Phlike Phlike Phlike Phlike
## [231] Phlike Other Other ETV ETV Other Phlike ETV Other Other
## [241] Other Other Other Other Other Other Other Other Other ETV
## [251] Phlike Other Other Other ETV ETV ETV ERG ETV Other
## [261] Other Phlike ETV Other Other Other Phlike Other Other Other
## [271] ETV Phlike ETV ERG Other Phlike ETV Other Other ETV
## [281] Phlike Other ETV Other ETV Other Other ETV Other Other
## [291] Other Other Phlike ETV Other Other Other ETV Phlike ETV
## [301] Other ETV Other ETV ETV ETV Other Other Other Other
## [311] Other Other Other Other Other Phlike Other Other Other Other
## [321] ETV Other Phlike ETV Other Other ETV ETV Other Other
## [331] Phlike Phlike ETV ERG Other ETV Other Other ETV Other
## [341] ETV ETV Other ETV Other Other Other ETV Other Other
## [351] Other ETV Other Other ETV Other Other Phlike Other Other
## [361] Other Other Other Other Other Other Other ETV Phlike Other
## [371] ETV Other ERG ETV ETV ERG Other Other ETV ETV
## [381] ETV ETV Other Other ETV Other Other
## Levels: ERG ETV Other Phlike
##
## $predicted$`10`
## [1] ERG ERG ERG ERG ERG ERG ERG ERG ERG ERG
## [11] ERG ERG ERG ERG ERG ERG ERG ERG ERG ERG
## [21] ERG ERG ERG ERG ERG ERG ERG ERG ERG ERG
## [31] ERG ERG ERG ERG ERG ERG ERG ERG ERG ERG
## [41] ERG ERG ERG ERG ERG ERG ERG ETV ETV ETV
## [51] ETV ETV ETV ETV ETV ETV ETV ETV ETV ETV
## [61] ETV ETV ETV ETV ETV ETV ETV ETV ETV ETV
## [71] ETV ETV ETV ETV ETV ETV ETV ETV ETV ETV
## [81] ETV ETV ETV ETV ETV ETV ETV ETV ETV ETV
## [91] ETV Other Other Other Other Other Phlike Phlike ETV Other
## [101] Other Other Other Phlike Phlike Phlike Phlike Phlike Phlike Phlike
## [111] Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike
## [121] Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Other Phlike
## [131] Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike
## [141] Phlike Phlike Other Phlike Phlike Phlike Other Phlike Phlike Phlike
## [151] Phlike Phlike Phlike Phlike Phlike Other Phlike Phlike Phlike Phlike
## [161] Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike
## [171] Phlike Phlike Phlike ETV Phlike Phlike Phlike Phlike Phlike Phlike
## [181] ETV Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike
## [191] Phlike Other Phlike Phlike Other Phlike Phlike Phlike Phlike Phlike
## [201] Phlike Phlike Other Phlike Phlike Phlike Phlike Phlike Phlike Phlike
## [211] Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike
## [221] Phlike Phlike Other Phlike Phlike Phlike Phlike Phlike Phlike Phlike
## [231] Phlike Phlike Other ETV ETV Other Phlike ETV Other Other
## [241] Other Other Other Other Other Other Other Other Other ETV
## [251] Other Other Other Other ETV ETV ETV ERG ETV Other
## [261] Other Phlike ETV Other Other Other Phlike Other Other Other
## [271] ETV Phlike ETV ERG Phlike Other ETV Other Other ETV
## [281] Other Other ETV Other ETV Other Other ETV Phlike Other
## [291] Other Other Phlike ETV Other Other Other ETV Phlike ETV
## [301] Other ETV Other ETV ETV ETV Other Other Other Other
## [311] Other Other Other Other Other ETV Other Other Other Other
## [321] ETV Other Other ETV Other Other ETV ETV Other Other
## [331] ETV Other ETV ERG Phlike ETV Other Other ETV Other
## [341] ETV ETV Other ETV ETV Other Other ETV Other Other
## [351] Other ETV Phlike Other ETV Other Other Phlike Other Other
## [361] Other Other Other Other Other Other Other ETV Phlike Other
## [371] ETV Other ERG ETV ETV ERG Other Other ETV ETV
## [381] ETV ETV Other Other ETV Other Other
## Levels: ERG ETV Other Phlike
##
## $predicted$`5`
## [1] ERG ERG ERG ERG ERG ERG ERG ERG ERG ERG
## [11] ERG ERG ERG ERG ERG ERG ERG ERG ERG ERG
## [21] ERG ERG ERG ERG ERG ERG ERG ERG ERG ERG
## [31] ERG ERG ERG ERG ERG ERG ERG ERG Other ERG
## [41] ERG ERG Other ERG ERG ERG Other ETV ETV Other
## [51] ETV ETV ETV ETV ETV ETV ETV ETV ETV ETV
## [61] ETV ETV Phlike Other Other Other Other Other ETV ETV
## [71] ETV ETV ETV ETV Phlike ETV Phlike Other ETV ETV
## [81] ETV Phlike Other ETV ETV ETV ETV Other ETV ETV
## [91] ETV Other Other Other ETV Other ETV Phlike Phlike Other
## [101] Other Phlike Other Phlike Other ETV Phlike Phlike Other ETV
## [111] Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike
## [121] Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike
## [131] Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike
## [141] Phlike Phlike Other Phlike Phlike Phlike Other Phlike Phlike Phlike
## [151] Phlike Phlike Phlike Phlike Phlike Other Phlike Phlike Phlike ETV
## [161] Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike
## [171] Phlike Phlike Phlike ETV Phlike Phlike Phlike Phlike Phlike Phlike
## [181] ETV Phlike Phlike Phlike Other Phlike Other Phlike Phlike Phlike
## [191] Phlike ETV Phlike Phlike Phlike Phlike ETV Phlike Phlike Phlike
## [201] Phlike Phlike Other Phlike Other Phlike Other Phlike Phlike ETV
## [211] Other Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike Phlike
## [221] ETV Phlike Phlike Phlike Phlike Phlike ETV Phlike Phlike Phlike
## [231] Phlike Phlike ETV Other ETV ETV Other ETV Other Phlike
## [241] Other Other Other Other Other ETV Phlike Other Other Other
## [251] Other Phlike Other Other ETV ETV ETV ERG Other Other
## [261] Other Phlike ETV Other ETV Other Phlike Other Phlike Other
## [271] ETV Phlike ETV ERG Phlike Phlike Phlike Other Other Phlike
## [281] Phlike Other ETV Other ETV Other Other ETV Other Other
## [291] ETV Other Phlike ETV Other Other Other Phlike Phlike Other
## [301] Other ETV Other ETV ETV Phlike ERG Other Other Other
## [311] ETV Other ETV ETV Other Other Other ETV Other Other
## [321] ETV Other ETV ETV Other ETV ETV ETV Phlike Other
## [331] ETV Other ETV ERG Phlike ETV ETV Other ETV Other
## [341] ETV ETV Other ETV Other Other ETV ETV Other ERG
## [351] ETV ETV Phlike Other ETV Other Other Phlike Other Other
## [361] Other Other Other Phlike Other Other Other Phlike Phlike ETV
## [371] ETV ERG ERG ETV ETV ERG Phlike ETV ETV Other
## [381] ETV Phlike ETV Other ETV ETV Other
## Levels: ERG ETV Other Phlike
##
## $predicted$`2`
## [1] ETV ERG ERG ERG ERG ERG ERG ERG Other ERG
## [11] ERG ERG ERG ETV ERG ERG ERG ERG ERG ERG
## [21] ERG ERG ERG ERG ERG ERG ERG ERG Other ERG
## [31] ERG ERG ERG ERG ERG ERG ERG ERG ERG ERG
## [41] ERG ERG ERG Other ERG ERG ETV ETV ETV ETV
## [51] ETV Phlike ETV ETV Phlike Other ETV ETV Phlike ETV
## [61] ERG ERG ETV ETV Other Other Phlike Other ETV ETV
## [71] ETV Other ETV Phlike Phlike ETV Phlike Other Phlike ETV
## [81] Other Phlike Other Phlike ETV ETV ETV Other ETV ETV
## [91] Other Other Other ETV ETV ETV Phlike Phlike Other Other
## [101] Other Other ETV Phlike Other Other Phlike Other Other ETV
## [111] Phlike Phlike Phlike Phlike Other Phlike Phlike Phlike ETV Phlike
## [121] Other Phlike Other ERG Phlike Phlike Phlike ETV Phlike Phlike
## [131] Phlike Phlike Other Phlike ETV Other Phlike Phlike Phlike Phlike
## [141] Phlike Phlike Other Phlike Phlike Phlike ETV Phlike Other Phlike
## [151] Phlike Phlike Other Phlike Phlike Phlike Phlike Phlike Phlike ETV
## [161] Phlike Other Other Phlike Phlike Phlike Phlike ETV Phlike Phlike
## [171] Other Phlike Phlike Phlike Phlike Phlike Other Phlike Phlike Other
## [181] Phlike ETV Other Phlike Phlike Phlike ETV ETV Phlike Other
## [191] Phlike ETV Other Phlike Other Other Phlike Phlike Phlike Phlike
## [201] Phlike Phlike Other Phlike Phlike ETV Phlike Phlike Phlike Other
## [211] ETV Phlike Phlike Phlike ETV Phlike ETV Phlike ETV Other
## [221] ETV Phlike ETV Phlike ETV Phlike Phlike Phlike ETV Phlike
## [231] Phlike Phlike ETV Other ERG Phlike Other Phlike ETV Phlike
## [241] ETV ETV Other Other ERG Other Other Phlike Other Other
## [251] Phlike Other Other Other Other Phlike Phlike ERG Other Phlike
## [261] ERG ETV ETV Phlike Phlike Other Phlike Other Phlike ETV
## [271] Phlike ETV ETV ERG Phlike Phlike Other Other Other Phlike
## [281] Phlike Other ETV Other ETV Other Phlike ETV Phlike Other
## [291] Phlike ETV Phlike ETV Other Other Other Phlike ETV Phlike
## [301] ETV ETV Phlike ETV Phlike ETV ERG Other ETV Other
## [311] ETV Phlike Phlike ETV Phlike Phlike Phlike Other Other Phlike
## [321] ETV Other ETV ERG Other ETV ETV ETV Other Other
## [331] ETV Phlike ETV ERG Phlike ETV Other ETV ETV Other
## [341] ETV Other Other Phlike Other Other Other ETV Other ETV
## [351] Other ETV Phlike Phlike ETV Other Other ERG Other ERG
## [361] Phlike Other Phlike Other Other Phlike Other Phlike Phlike ETV
## [371] ETV ERG Other ETV ERG ERG Other Other ETV ETV
## [381] Other ETV ETV Other ETV Other Other
## Levels: ERG ETV Other Phlike
##
## $predicted$`1`
## [1] ETV ETV Other Phlike ERG ERG ERG ERG Other ERG
## [11] ERG ERG ERG ETV ERG ERG ERG ERG ERG ERG
## [21] ERG ERG ERG ERG ERG ETV Phlike ERG ETV ERG
## [31] Phlike ERG Phlike ERG ERG ERG ERG ERG ETV ERG
## [41] ERG ERG ERG Phlike ERG ERG ERG Other Other Other
## [51] ETV Phlike ETV ETV Phlike Other ETV ETV Phlike Phlike
## [61] Phlike ERG Other ETV Other Other Phlike Phlike Other Other
## [71] ETV Other ETV Other ETV Phlike Phlike ETV Phlike ETV
## [81] Phlike Phlike ETV ETV Other ETV ETV ETV ETV ERG
## [91] Other Phlike Phlike ETV Other Phlike Other Phlike Other Other
## [101] Other ETV ETV Phlike Phlike ETV Phlike Phlike ETV ETV
## [111] Other Phlike Phlike Other Other Other Phlike Phlike Phlike ETV
## [121] ETV Phlike ETV ERG ETV ETV ETV ETV ETV Phlike
## [131] Phlike Phlike Phlike Other Other ETV Other Other Other ETV
## [141] Other Phlike Phlike Phlike Phlike Phlike Phlike Other ETV Phlike
## [151] Phlike ETV Phlike Other Other Other Phlike Other Other Other
## [161] Other Other ETV Phlike Other Phlike Phlike Other Phlike Phlike
## [171] ETV Phlike Phlike Phlike Phlike Phlike Other Phlike Phlike Phlike
## [181] Phlike ETV Phlike Phlike Phlike Other Phlike ETV Phlike Phlike
## [191] ETV Other Other Phlike Phlike ETV ETV Other Other ETV
## [201] Phlike Other Phlike ETV Phlike ETV Phlike Phlike Other ETV
## [211] ETV ETV Other ETV ETV Other ETV Other Phlike Phlike
## [221] ETV ERG Other Phlike ETV Phlike Phlike ETV ETV Phlike
## [231] Other Other ETV Phlike ETV Phlike ETV ERG Phlike Phlike
## [241] Other ETV Phlike Other ERG Phlike Other Phlike ETV Other
## [251] ETV Phlike Other Phlike Other Other Other ETV Other ETV
## [261] Other ETV Phlike Phlike Other Phlike ETV Other ETV ETV
## [271] Phlike Phlike ETV ERG Phlike Other Phlike Phlike Other Phlike
## [281] ETV Other ETV Phlike ETV Other Phlike Phlike ETV ETV
## [291] Phlike ETV Phlike Phlike Phlike ETV Phlike Phlike Phlike Phlike
## [301] Phlike Phlike Phlike Phlike Other Phlike ERG Phlike ETV Other
## [311] Other Phlike Phlike ETV ETV Other Other Other Other Phlike
## [321] ETV Other Phlike Phlike Phlike ETV Phlike Phlike Phlike Other
## [331] Other Phlike Other ERG Phlike ETV ETV ETV Phlike ETV
## [341] Other Other Other Other Other Other Phlike ETV Other Phlike
## [351] Other Phlike Phlike Phlike ETV Other Other ERG Other ERG
## [361] Phlike Phlike Other Phlike Phlike ETV Other Other Phlike Phlike
## [371] ETV ERG Other ETV ERG ERG Other Other ETV ETV
## [381] Other ETV ETV Phlike ETV Phlike ETV
## Levels: ERG ETV Other Phlike
#Tune for mtry on the reduced (top-20 gene) feature set
tuneRF(df.train[,topgenes],classtrain,stepFactor = 0.5,ntreeTry=401,mtryStart=5)
## mtry = 5 OOB error = 5.94%
## Searching left ...
## mtry = 10 OOB error = 6.98%
## -0.173913 0.05
## Searching right ...
## mtry = 2 OOB error = 6.2%
## -0.04347826 0.05
## mtry OOBError
## 2.OOB 2 0.06201550
## 5.OOB 5 0.05943152
## 10.OOB 10 0.06976744
set.seed(1)
#Final reduced model on the top genes only; mtry=5 from the tuning above
RF2 <-randomForest(classtrain~., data=data.frame(df.train[,topgenes]), importance=T, mtry=5, proximity=T)
RF2
##
## Call:
## randomForest(formula = classtrain ~ ., data = data.frame(df.train[, topgenes]), importance = T, mtry = 5, proximity = T)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 6.46%
## Confusion matrix:
## ERG ETV Other Phlike class.error
## ERG 50 1 1 0 0.03846154
## ETV 0 90 1 0 0.01098901
## Other 0 2 94 10 0.11320755
## Phlike 0 2 8 128 0.07246377
#Save the RF object and topgenes so the classifier can be reused elsewhere
save(RF2,file="RF_model.Rda")
save(topgenes,file="topGeneNames.Rda")
Now lets explore our random forest to see if it is doing what we think and want. So lets again consider the MDS plot made from the proximity matrix, but also look at a heatmap based on a hierarchical clustering of the top 20 genes used in our random forest as a cross-check:
#Heatmap of the batch-corrected expression of the top genes,
#columns coloured by fusion class as a cross-check of the clustering
topdata = res2[topgenes,]
yx <- as.matrix(topdata)
heatmap.2(yx,colCol=pal[all_samp$Fusion],trace = "none",col =brewer.pal(11,"Spectral"))
#Check how the MDS plot looks with just these top genes
pal = brewer.pal(9,"Set1")
#Define marker set as a vector
pchvec <- c(1:2)
batch <- c(rep(1,289),rep(2,195))
plotMDS(res2[topgenes,],gene.selection="common",cex=0.8,col=pal[all_samp$Fusion],pch=pchvec[batch])
legend('bottomright',legend=unique(levels(all_samp$Fusion)),fill=pal,pch=16)
legend('bottomleft',legend=c('St.Judes','Lund'),pch=unique(pchvec),cex=0.5)
But the question is, do the subclasses of other classify?
# Re-colour the MDS plot by each sample's original (fine-grained) class
# label to see whether the Other subclasses separate.
#color = grDevices::colors()[grep('gr(a|e)y', grDevices::colors(), invert = T)]
color <- ggplotColors(16)
pchvec <- 1:2
batch <- rep(1:2, times = c(289, 195))
plotMDS(res2[topgenes, ], gene.selection = "common", cex = 0.8,
        col = color[factor(all_samp$Original_Class)], pch = pchvec[batch])
legend('bottomleft', legend = unique(levels(factor(all_samp$Original_Class))),
       fill = color, pch = 16, cex = 0.5, ncol = 2)
legend('bottomright', legend = c('St.Judes', 'Lund'),
       pch = unique(pchvec), cex = 0.5)
Now we also want to check whether the top genes are biased towards either batch.
# Batch check: colour the MDS plot by cohort to confirm the top genes do
# not simply separate St.Judes samples from Lund samples.
pal <- brewer.pal(9, "Set1")
pchvec <- 1:2
batch <- rep(1:2, times = c(289, 195))
plotMDS(res2[topgenes, ], gene.selection = "common", cex = 0.8,
        col = pal[batch], pch = pchvec[batch])
legend('topright', legend = c("St.Judes", "Lund"), fill = pal, pch = pchvec)
#Time to test. Now we wish to understand how well the randomForest does at predicting on the test dataset.
# Hard class calls for the held-out test samples.
RF2pred <- predict(RF2,data.frame(df.test))
# Per-class vote fractions ("probabilities") for the same samples.
RF2prob <-predict(RF2,data.frame(df.test),type="prob")
So as you can see the randomForest is doing a reasonable job and does what we think! Let's visualise this in a confusion matrix:
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
# Heatmap of the OOB confusion matrix (the class.error column is dropped).
conf <- RF2$confusion[, 1:4]
# Optional per-column normalisation, kept for reference:
#conf.norm = apply(conf,2, function(x) x/sum(x))
# Long format: Var1 = true class (rows), Var2 = predicted class (columns).
confusion <- as.data.frame(as.table(conf))
values <- c(0.05, 1.0)
gg <- ggplot(confusion, aes(x = Var1, y = Var2, fill = Freq))
gg +
  geom_tile() +
  labs(fill = "Frequency", x = "Truth", y = "Predicted") +
  geom_text(aes(label = Freq), show.legend = FALSE, colour = "white")
# Assemble a per-sample results table: the four class probabilities, the
# predicted call, the true label, and the winning (majority) probability.
out <- data.frame(RF2prob)
out <- cbind(out, RF2pred)
out$True <- classtest
# Largest vote fraction per sample (row-wise maximum); used later when
# choosing an "Unclassified" threshold.
Maj <- apply(RF2prob[, 1:4], 1, max)
out$Maj <- Maj
head(out)
## ERG ETV Other Phlike RF2pred True
## iAMP21.SJBALL021900_D1.TB.95.0559 0.078 0.004 0.616 0.302 Other Other
## Phlike.SJBALL021076_D1.PASZCR 0.000 0.000 0.056 0.944 Phlike Phlike
## PHALL.SJPHALL020037_D1.PANLSC 0.160 0.004 0.050 0.786 Phlike Phlike
## Case_148 0.004 0.002 0.920 0.074 Other Other
## ETV.SJETV089_D.TB.06.1784 0.008 0.900 0.054 0.038 ETV ETV
## Case_142 0.006 0.854 0.014 0.126 ETV ETV
## Maj
## iAMP21.SJBALL021900_D1.TB.95.0559 0.616
## Phlike.SJBALL021076_D1.PASZCR 0.944
## PHALL.SJPHALL020037_D1.PANLSC 0.786
## Case_148 0.920
## ETV.SJETV089_D.TB.06.1784 0.900
## Case_142 0.854
#First make matrix
# Test-set confusion matrix (predicted vs true). Select the columns by
# NAME rather than by position — the original used out[,c(5,6)], which
# silently breaks whenever columns are appended to or reordered in `out`
# (a Maj column has already been appended once).
conf = table(out[,c("RF2pred","True")])
print(conf)
## True
## RF2pred ERG ETV Other Phlike
## ERG 11 0 0 0
## ETV 0 17 0 0
## Other 0 0 28 3
## Phlike 0 0 2 36
#Rejig matrix to table then to dataframe
confusion = as.data.frame(as.table(conf))
# Heatmap of the test-set confusion matrix with counts overlaid.
gg <- ggplot(confusion,aes(x=RF2pred,y=True,fill=Freq))
gg + geom_tile() + labs(fill="Frequency") + geom_text(aes(label=Freq),show.legend = FALSE,colour="white")
So now the question we want to address is: what happens when we have a sample that doesn't fit into one of the above four classes? In that case we wish the sample to be called as "Unclassified". So at what probability threshold should we consider something unclassified rather than one of the other classes?
#Lets take the test set and the "Other" samples and see what goes down
# Expression of the top genes for all samples labelled "Other".
# NOTE(review): cands/autre are only consumed by the commented-out rbind
# branch below; with the current toggle they are computed but unused.
cands <- all_samp[all_samp$Fusion == "Other",]
autre <- datExpr[topgenes,cands$Sample]
#If we excluded the Others add them in
#combo = rbind(df.test[,topgenes],t(autre))
#Otherwise
# Current toggle: score the plain test set only.
combo = df.test[,topgenes]
# Hard calls and per-class vote fractions for the chosen sample set.
RF2pred_autre <- predict(RF2,data.frame(combo))
RF2prob_autre <-predict(RF2,data.frame(combo),type="prob")
autre_out = data.frame(RF2prob_autre)
autre_out = cbind(autre_out,RF2pred_autre)
#If excluded Others add them in
#autre_out$True = c(as.character(classtest),as.character(cands$Fusion))
#otherwise
autre_out$True = c(as.character(classtest))
# Winning probability per sample — the quantity to threshold for an
# "Unclassified" call.
Maj = apply(RF2prob_autre[,1:4],1,max)
autre_out$Prob = Maj
How about class-specific thresholds?
#Find MLL samples as litmus tests
# NOTE(review): mlls is computed here but never referenced below — the
# litmus cases are hard-coded by name in the geom_vline calls instead.
mlls= row.names(lsamp[lsamp$Genetic.subtype.after.RNA.seq == "MLL",])
gg <- ggplot(autre_out)
# For each class: density of that class's probability across all scored
# samples, with selected Lund litmus cases as dashed purple lines and the
# true members of the class as solid yellow lines. A good class-specific
# threshold should separate the yellow lines from the purple ones.
#ETV
gg + geom_density(aes(ETV),col="red") +
geom_vline(xintercept=autre_out["Case_22","ETV"],col="purple",linetype=2) + geom_vline(xintercept=autre_out["Case_58","ETV"],col="purple",linetype=2) + geom_vline(xintercept=autre_out["Case_170","ETV"],col="purple",linetype=2) + geom_vline(xintercept=autre_out["Case_152","ETV"],col="purple",linetype=2) +
geom_vline(xintercept=autre_out[autre_out$True=="ETV","ETV"],col="yellow")
# The warnings below arise because some hard-coded cases are absent from
# autre_out, making their xintercept NA — presumably those cases are not
# in the current test split; verify the case IDs if they matter.
## Warning: Removed 1 rows containing missing values (geom_vline).
## Warning: Removed 1 rows containing missing values (geom_vline).
#ERG
gg + geom_density(aes(ERG),col="blue") + geom_vline(xintercept=autre_out["Case_10","ERG"],col="purple",linetype=2) + geom_vline(xintercept=autre_out["Case_170","ERG"],col="purple",linetype=2) + geom_vline(xintercept=autre_out["Case_163","ERG"],col="purple",linetype=2) + geom_vline(xintercept=autre_out["Case_152","ERG"],col="purple",linetype=2) + geom_vline(xintercept=autre_out[autre_out$True=="ERG","ERG"],col="yellow")
## Warning: Removed 1 rows containing missing values (geom_vline).
## Warning: Removed 1 rows containing missing values (geom_vline).
## Warning: Removed 1 rows containing missing values (geom_vline).
#Phlike
gg + geom_density(aes(Phlike),col="green") + geom_vline(xintercept=autre_out["Case_10","Phlike"],col="purple",linetype=2) + geom_vline(xintercept=autre_out["Case_170","Phlike"],col="purple",linetype=2) + geom_vline(xintercept=autre_out["Case_163","Phlike"],col="purple",linetype=2) + geom_vline(xintercept=autre_out["Case_152","Phlike"],col="purple",linetype=2) +
geom_vline(xintercept=autre_out[autre_out$True=="Phlike","Phlike"],col="yellow")
## Warning: Removed 1 rows containing missing values (geom_vline).
## Warning: Removed 1 rows containing missing values (geom_vline).
## Warning: Removed 1 rows containing missing values (geom_vline).
#Hyperdiploidy
# Disabled: autre_out has no HyperDiploidy probability column in the
# current 4-class model.
#hyper= row.names(lsamp[lsamp$Genetic.subtype.after.RNA.seq == "HyperDiplody",])
#hyp = gg + geom_density(aes(HyperDiploidy),col="black") + geom_vline(xintercept=autre_out["Sample_10","HyperDiploidy"],col="purple",linetype=2) + geom_vline(xintercept=autre_out["Sample_170","HyperDiploidy"],col="purple",linetype=2) + geom_vline(xintercept=autre_out["Sample_163","HyperDiploidy"],col="purple",linetype=2) + geom_vline(xintercept=autre_out["Sample_152","HyperDiploidy"],col="purple",linetype=2) +
#geom_vline(xintercept=autre_out["Sample_23","HyperDiploidy"],col="yellow",linetype=1) +
#geom_vline(xintercept=autre_out[autre_out$True=="HyperDiploidy","HyperDiploidy"],col="yellow")
#hyp
#Other
# NOTE(review): hyper is computed but not referenced in the plot below.
hyper= row.names(lsamp[lsamp$Genetic.subtype.after.RNA.seq == "Other",])
hyp = gg + geom_density(aes(Other),col="black") +
geom_vline(xintercept=autre_out[autre_out$True=="Other","Other"],col="yellow") + geom_vline(xintercept=autre_out["Case_163","Other"],col="purple",linetype=2)
hyp
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.3.2
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
##
## expand
# prob_plot: stacked-bar plot of the four class probabilities for every
# sample whose true label is `class_name`, ordered by decreasing
# probability of that class. X-axis tick marks are coloured by batch
# (orange = Lund, blue = St.Judes).
#   class_name - one of "ERG", "ETV", "Other", "Phlike"
#   df         - data frame with per-class probability columns plus
#                True and batch columns (e.g. autre_out)
prob_plot = function(class_name, df) {
  keep <- df[df$True == class_name, ]
  # Best-scoring samples first.
  keep <- keep[order(-keep[, class_name]), ]
  keep$Sample <- row.names(keep)
  # Wide -> long: one row per (sample, class) probability.
  long <- gather(keep, class, prob, ERG:Phlike)
  # Freeze the sample order so ggplot keeps the sort.
  long$Sample <- factor(long$Sample, levels = unique(long$Sample))
  tick_cols <- ifelse(long$batch == "Lund", "#f9b52c", "#2cabf9")
  ggplot(long) +
    geom_bar(aes(x = Sample, y = prob, fill = class), stat = 'identity') +
    theme_bw() +
    theme(axis.text.x = element_blank(),
          axis.ticks.x = element_line(color = tick_cols, size = 5),
          axis.title.y = element_text(size = 18),
          axis.title.x = element_text(size = 18),
          title = element_text(size = 18),
          legend.position = "none") +
    labs(y = "Probability", title = paste0("Truth Class ", class_name))
}
# NOTE(review): this definition is an exact byte-for-byte duplicate of
# the prob_plot defined immediately above; it harmlessly re-binds the
# same function and is a candidate for deletion.
# prob_plot: stacked-bar plot of the four class probabilities for every
# sample whose true label is class_name, ordered by decreasing
# probability of that class.
#   class_name - one of "ERG", "ETV", "Other", "Phlike"
#   df         - data frame with per-class probability columns plus
#                True and batch columns (e.g. autre_out)
prob_plot = function(class_name,df){
# Keep only samples of the requested truth class, best-scoring first.
dat = df[df$True==class_name,]
dat = dat[order(-dat[,class_name]),]
dat$Sample = row.names(dat)
# Wide -> long: one row per (sample, class) probability.
dat= dat %>% gather(class,prob,ERG:Phlike)
# Freeze the sample order so ggplot keeps the sort.
my.order = unique(dat$Sample)
dat$Sample <- factor(dat$Sample, levels = my.order)
#classes = unique(dat$class)
#dat$class <- factor(dat$class,levels=c(class_name,classes[classes!=class_name]))
# X-axis tick colours by batch (orange = Lund, blue = St.Judes).
lab_cols = ifelse(dat$batch == "Lund","#f9b52c","#2cabf9")
gg <- ggplot(dat)
gg + geom_bar(aes(x=Sample,y=prob,fill=class),stat='identity') + theme_bw() + theme(axis.text.x=element_blank(),axis.ticks.x=element_line(color=lab_cols,size=5),axis.title.y=element_text(size=18),axis.title.x=element_text(size=18), title=element_text(size=18), legend.position="none") + labs(y="Probability", title=paste0("Truth Class ",class_name))
}
# Flag each sample's batch: Lund sample names all start with "Case";
# everything else comes from St.Judes.
from_lund <- grepl("Case", row.names(autre_out))
autre_out$batch = ifelse(from_lund, "Lund", "St.Judes")
# One probability bar plot per truth class.
prob_plot('Phlike', autre_out)
prob_plot('ETV', autre_out)
prob_plot('ERG', autre_out)
prob_plot('Other', autre_out)